package cn.mc.scheduler.crawlerOverseas.technologyreview.com;

import cn.mc.core.dataObject.NewsContentOverseasArticleDO;
import cn.mc.core.dataObject.NewsDO;
import cn.mc.core.dataObject.NewsImageDO;
import cn.mc.core.manager.NewsCoreManager;
import cn.mc.core.manager.NewsImageCoreManager;
import cn.mc.core.manager.NewsContentOverseasArticleCoreManager;
import cn.mc.core.utils.DateUtil;
import cn.mc.core.utils.EncryptUtil;
import cn.mc.core.utils.IDUtil;
import cn.mc.scheduler.base.BaseSpider;
import cn.mc.scheduler.crawler.BaseCrawler;
import cn.mc.scheduler.crawlerOverseas.OverseasCrawlerPipeline;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * 海外科技评论（下载类）新闻
 *
 * @author xl
 * @date 2018/08/28 下午 15:46
 */
@Component
public class TechnologyReviewCrawler extends BaseCrawler {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(500);

    private static final String URL = "https://www.technologyreview.com/the-download/";

    private static final String REQUEST_URL = "https://www.technologyreview.com";
    @Autowired
    private NewsImageCoreManager newsImageCoreManager;


    @Autowired
    private NewsCoreManager newsCoreManager;
    @Autowired
    private NewsContentOverseasArticleCoreManager overseasNewsContentArticleCoreManager;

    @Autowired
    private OverseasCrawlerPipeline overseasCrawlerPipeline;

    @Override
    public Spider createCrawler() {
        Request request = new Request(URL);
        request.addHeader(HEADER_USER_AGENT_KEY, USER_AGENT_ANDROID_OS);
        BaseSpider spider = new BaseSpider(this);
        spider.addRequest(request);
        return spider;
    }

    @Override
    public void process(Page page) {
        if (URL.contains(page.getUrl().toString())) {
            Html html = page.getHtml();
            //获取列表
            List<Selectable> nodes = html.xpath("//article[@class='download__item']").nodes();
            for(Selectable contentNode:nodes){
                //获取标题
                List<Selectable> titleNodes=contentNode.xpath("//h2[@class='download__headline']/text()").nodes();
                String title;
                if(titleNodes.size()>0){
                    title=titleNodes.get(0).toString();
                }else{//没有标题的丢弃
                    continue;
                }
                //获取源连接
                String newsSourceUrl=REQUEST_URL+contentNode.xpath("//a[@class='read-more']/@href").toString();

                String dataKey= EncryptUtil.encrypt(newsSourceUrl, "md5");
                //来源作者
                String source=contentNode.xpath("//a[@class='download__author']/div[@class='download__author-meta']/p/strong/text()").toString();
                String keywords="";
                //新闻Id
                Long newsId = IDUtil.getNewID();
                String newsUrl = "";
                String shareUrl = "";
                //是否禁止评论 0允许
                Integer banComment=0;
                Integer newsHot = 0;
                Integer videoCount = 0;
                Integer sourceCommentCount=0;
                String imageUrl=contentNode.xpath("//picture[@class='feed-tz__image']/img/@src").toString();
                if(StringUtils.isEmpty(imageUrl)){
                    continue;
                }
                List<NewsImageDO> newsImageDOList = new ArrayList<>();
                newsImageDOList.add(newsImageCoreManager.buildNewsImageDO(
                        IDUtil.getNewID(), newsId,
                        imageUrl, 0, 0,
                        NewsImageDO.IMAGE_TYPE_MINI));
                Integer displayType=NewsDO.DISPLAY_TYPE_ONE_MINI_IMAGE;
                //获取文章发布时间
                String time=contentNode.xpath("//a[@class='download__author']/div[@class='download__author-meta']/time/@datetime").toString();

                time=time.replace("T"," ");
                //发布时间
                Date displayTime = null;
                try {
                    displayTime= DateUtil.parse(time,DateUtil.DATE_FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND2);
                }catch (Exception ex){
                    ex.printStackTrace();
                }
                NewsDO newsDO = newsCoreManager.buildNewsDO(
                        newsId, dataKey, title, newsHot,
                        newsUrl, shareUrl, source, newsSourceUrl,
                        NewsDO.NEWS_TYPE_TECHNOLOGY_OVERSEAS,
                        NewsDO.CONTENT_TYPE_IMAGE_TEXT,
                        keywords, banComment,
                        "", displayTime,
                        videoCount, 1, null,
                        displayType, sourceCommentCount, 0);

                //内容
                String content = html.xpath("//div[@class='download__dek download__dek--full']").toString();
                if(StringUtils.isEmpty(content)){
                    continue;
                }
                //构建海外新闻内容结构
                NewsContentOverseasArticleDO newsContentOverseasArticleDO=overseasNewsContentArticleCoreManager.buildOverseasArticleDO(IDUtil.getNewID(),newsId,content,title);
                //添加新闻
                overseasCrawlerPipeline.save(newsDO,newsImageDOList,newsContentOverseasArticleDO);
            }

        }
    }

    @Override
    public Site getSite() {
        return site;
    }

}
